Introduction

Link to competition: https://www.kaggle.com/c/kobe-bryant-shot-selection

!Important: Only train on events that occured prior to the shot being predicted, in order to avoid Leakage

Data load

df <- read_csv(url("https://raw.githubusercontent.com/serbanc94/kobe_bryant_stats/master/data.csv"))
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   action_type = col_character(),
##   combined_shot_type = col_character(),
##   lat = col_double(),
##   lon = col_double(),
##   season = col_character(),
##   shot_type = col_character(),
##   shot_zone_area = col_character(),
##   shot_zone_basic = col_character(),
##   shot_zone_range = col_character(),
##   team_name = col_character(),
##   game_date = col_date(format = ""),
##   matchup = col_character(),
##   opponent = col_character()
## )
## See spec(...) for full column specifications.

Data repair

Some teams changed their names, so the abbreviations changed as well; In the data file the same team appeared under different abbreviations

Brooklyn Nets(BKN) was New Jersey Nets(NJN)

https://en.wikipedia.org/wiki/Brooklyn_Nets

New Orleans Pelicans(NOP) was New Orleans Hornets(NOH)

https://en.wikipedia.org/wiki/New_Orleans_Pelicans

df <- df %>%
  mutate(opponent = replace(opponent, opponent == "BKN", "NJN")) %>%
  mutate(opponent = replace(opponent, opponent == "NOP", "NOH")) %>%
  rowwise() %>%
  mutate(home_match = grepl("vs", matchup))

Injuries that kept him on he bench:

https://pbs.twimg.com/media/BioEZffCAAAIJwr.png

Worth noting: 99-00, 03-04, 04-05,13-14

Data enrichment

Supplimetary CSV with team names abbreviations https://sportsdelve.wordpress.com/abbreviations/

full_names_df <- read_csv(url("https://raw.githubusercontent.com/serbanc94/kobe_bryant_stats/master/team_names.csv"))
## Parsed with column specification:
## cols(
##   opponent = col_character(),
##   opponent_full_name = col_character()
## )
df <- df %>%
  filter(!is.na(shot_made_flag)) %>%
  left_join(full_names_df) %>%
  mutate(shot_value = as.numeric(substring(shot_type, 1, 1)))
## Joining, by = "opponent"

Generate some insights

games_per_season <- df %>%
  group_by(season) %>%
  distinct(game_id) %>%
  summarise(games = n())
## Warning: Grouping rowwise data frame strips rowwise nature
seasonal_stats <- df %>%
  filter(home_match == TRUE) %>%
  group_by(season, shot_made_flag) %>%
  summarise(count = n()) %>%
  spread(shot_made_flag, count) %>%
  mutate(shots_total = `1` + `0`) %>%
  mutate(shot_accuracy = 100 * (`1` / (`1` + `0`))) %>%
  ungroup() %>%
  inner_join(games_per_season) %>%
  mutate(success_per_game = `1` / games) %>%
  select(
    season,
    games,
    shots_success = `1`,
    shots_fail = `0`,
    shots_total,
    shot_accuracy,
    success_per_game
  )
## Warning: Grouping rowwise data frame strips rowwise nature
## Joining, by = "season"
action_type_stats <- df %>%
  mutate(shot_made_flag = as.numeric(shot_made_flag)) %>%
  group_by(action_type, shot_made_flag) %>%
  summarise(count = n()) %>%
  spread(shot_made_flag, count) %>%
  mutate_all(funs(replace(., is.na(.), 0)))
## Warning: Grouping rowwise data frame strips rowwise nature

Visual insights

Action type bar chart

df %>%
  group_by(action_type) %>%
  summarise(count = n()) %>%
  plot_ly(y = ~ count, type = "bar", hoverinfo = "text", text = (~ action_type))
## Warning: Grouping rowwise data frame strips rowwise nature
# Shots stats by season
seasonal_stats_graph <- plot_ly(seasonal_stats,
  x = ~ `season`,
  y = ~ shots_success,
  type = "bar",
  color = ~ `shot_accuracy`,
  hoverinfo = "text",
  text = ~ paste(
    "Full stats, season ", season, "</br>:",
    "</br> Total shots: ", shots_total,
    "</br> Failed shots: ", shots_fail,
    "</br> Success shots: ", shots_success,
    "</br> Percentage: ", round(shot_accuracy, 2), "%"
  )
) %>%
  layout(
    title = "Shots statistics by season",
    yaxis = list(
      title = "# of succesful shots"
    )
  )

seasonal_stats_graph
## Warning: Numeric color variables cannot (yet) be mapped to text.
## Feel free to make a feature request 
## https://github.com/plotly/plotly.js
# Scatterplot
loc_df <- df %>%
  select(loc_x, loc_y, shot_made_flag) %>%
  group_by(loc_x, loc_y, shot_made_flag) %>%
  summarize(count = n()) %>%
  spread(shot_made_flag, count) %>%
  arrange(desc(`1`)) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>%
  rowwise() %>%
  mutate(percentage = `1` / (`1` + `0`)) %>%
  plot_ly(
    x = ~ loc_x,
    y = ~ loc_y,
    color = ~ percentage,
    type = "scatter"
  )
## Warning: Grouping rowwise data frame strips rowwise nature
loc_df
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# Shot frequency heatmap
inferno_colors <- inferno(100)

df %>%
  filter(abs(loc_x) > 1, abs(loc_y) > 1) %>%
  ggplot() +
  stat_density_2d(
    aes(
      x = loc_x, y = loc_y,
      fill = ..density..
    ),
    geom = "raster", contour = FALSE, interpolate = TRUE, n = 200
  ) +
  scale_fill_gradientn(colors = inferno_colors, guide = FALSE)

# Hexplot

ggplot(data = df) +
  geom_hex(aes(x = loc_x, y = loc_y), binwidth = c(15, 15)) +
  scale_fill_gradient(trans = "log", low = "blue", high = "red") +
  facet_wrap(~ shot_made_flag) +
  coord_fixed() +
  ggtitle("Misses vs makes")

# Bubble chart of accuracy percentages

loc_matrix <- as.matrix(cbind(df$loc_x, df$loc_y))
loc_cluster <- kmeans(loc_matrix, centers = 700)
df$loc_cluster <- loc_cluster$cluster


cluster_percentages <- df %>%
  group_by(loc_cluster, shot_made_flag) %>%
  summarize(count = n()) %>%
  spread(shot_made_flag, count) %>%
  mutate_all(funs(replace(., is.na(.), 0))) %>%
  mutate(total = as.integer(`1` + `0`)) %>%
  mutate(percentage = `1` / total)
## Warning: Grouping rowwise data frame strips rowwise nature
cluster_percentages$center_x <- loc_cluster$centers[, 1]
cluster_percentages$center_y <- loc_cluster$centers[, 2]

cluster_percentages %>%
  mutate(reg_total = log(total)) %>%
  plot_ly(
    x = ~ center_x,
    y = ~ center_y,
    type = "scatter",
    mode = "markers",
    color = ~ percentage,
    colors = "Reds",
    marker = list(
      size = ~ reg_total * 1.5,
      opacity = ~ percentage * 3
    )
  ) %>%
  layout(
    plot_bgcolor = "rgb(120,120,120)"
  )